# base libraries
import pandas as pd
import numpy as np
# preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# feature selection libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
# modelling libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC  # fix: 'SV' does not exist in sklearn.svm and raised ImportError
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
# model selection & tuning libraries
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
# visualisation libraries
import plotly.figure_factory as ff
from plotly import express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# compute performance libraries
from joblib import Parallel, delayed
# storage & retrieval
import pickle
# widen pandas display so the wide sensor frames render usefully
pd.set_option('display.max_columns',50)
pd.set_option('display.max_rows',100)
def nulsCount(df):
    """Summarise missing/unexpected values per column of *df*.

    Counts, per column: NULL (isnull), NAN (isna), BLANKS ("" / " "),
    and UNEXP placeholder tokens ("-","?",".","NA","N/A","Unknown"),
    plus each count as a percentage of the rows.

    Returns a DataFrame restricted to columns with at least one flagged
    value, or None when every column is clean.
    """
    d2 = pd.DataFrame(index=df.columns)
    # removed the original bare try/except-pass: a failure there left d2
    # with missing columns and caused a confusing KeyError further down
    d2["NULL"] = df.isnull().sum().astype('uint32')
    d2["NAN"] = df.isna().sum().astype('uint32')
    d2["BLANKS"] = df.isin(["", " "]).sum().astype('uint32')
    d2["UNEXP"] = df.isin(["-", "?", ".", "NA", "N/A", "Unknown"]).sum().astype('uint32')
    # shortlist: keep only columns where any counter is non-zero
    d2 = d2.loc[d2.sum(axis=1) > 0]
    # convert counts to percentages of total rows
    for col in ["NULL", "NAN", "BLANKS", "UNEXP"]:
        d2[col + " %"] = d2[col].mul(100 / df.shape[0]).round(2)
    # rearrange: count next to its percentage
    d2 = d2[["NULL", "NULL %", "NAN", "NAN %", "BLANKS", "BLANKS %", "UNEXP", "UNEXP %"]]
    return d2 if d2.shape[0] else None
class SCFS():
    """https://www.frontiersin.org/articles/10.3389/fgene.2021.684100/full
    Reference article for feature scoring
    SCFS (Standard deviation and Cosine similarity based Feature Selection)
    Credits to: Juanying Xie, Mingzhao Wang, Shengquan Xu, Zhao Huang and Philip W. Grant"""
    def __init__(self,kind='exp'):
        """kind = {'exp','reciprocal','anti-similarity'} default='exp'"""
        # kind selects which independence formula independence() applies
        # NOTE(review): an unrecognised kind leaves self.ind entries NaN — confirm intended
        self.kind=kind
        self.fitted=False
    def discernibility(self):
        """list down the feature discernibility
        same as sample standard deviations"""
        m=self.df.shape[0]
        # manual sample std-dev per column (denominator m-1, i.e. ddof=1)
        self.dis=[np.sqrt(sum((self.df[i]-sum(self.df[i])/m)**2)/(m-1)) for i in self.df.columns]
        self.dis=pd.Series(self.dis,index=self.df.columns,dtype=float)
    def cosineSimilarity(self):
        """populate the cosine similarities (absolute)"""
        # pairwise |cosine| between every pair of feature columns
        # (O(p^2) dot products; symmetric matrix computed in full)
        self.cosdf=pd.DataFrame(columns=self.df.columns,index=self.df.columns)
        for i in self.df.columns:
            for j in self.df.columns:
                norm_i=np.sqrt(self.df[i].dot(self.df[i]))
                norm_j=np.sqrt(self.df[j].dot(self.df[j]))
                self.cosdf.loc[i,j] = (np.abs(self.df[i].dot(self.df[j])))/(norm_i*norm_j)
    def independence(self):
        """evaluate the feature independance"""
        # label of the feature with the largest discernibility (std-dev)
        dismaxarg=self.dis.index[np.argmax(self.dis)]
        self.ind=pd.Series(index=self.df.columns,dtype=float)
        for i in self.df.columns:
            if i == dismaxarg: # for feature with max stddev
                # independence computed against similarities to ALL features
                if self.kind == 'exp':
                    self.ind[i] = np.exp(max(-self.cosdf.loc[i]))
                elif self.kind == 'reciprocal':
                    self.ind[i] = max(1/self.cosdf.loc[i])
                elif self.kind == 'anti-similarity':
                    self.ind[i] = max(1-self.cosdf.loc[i])
            else:
                # independence w.r.t. only the features with strictly larger std-dev
                if self.kind == 'exp':
                    self.ind[i] = np.exp(min(-self.cosdf.loc[i,self.dis[self.dis>self.dis[i]].index]))
                elif self.kind == 'reciprocal':
                    self.ind[i] = min(1/self.cosdf.loc[i,self.dis[self.dis>self.dis[i]].index])
                elif self.kind == 'anti-similarity':
                    self.ind[i] = min(1-self.cosdf.loc[i,self.dis[self.dis>self.dis[i]].index])
    def fit(self,df):
        """evaluate feature scores of df"""
        self.df=df.copy()
        self.discernibility()
        self.cosineSimilarity()
        self.independence()
        # final score per feature = discernibility * independence
        self.fscore=self.dis.mul(self.ind)
        self.fitted=True
# reset the global score log (used by reporter) so scoring starts fresh
try:
    del scoreLog
    print("scoreLog deleted")
except NameError:  # narrowed from bare except: only catch "not defined"
    print("scoreLog undefined")
# defining a function to report classification metrics
def reporter(Y_train, pred_train, Y_test, pred_test, model_name):
    """Classification report
    logs test scores to global dataframe named scoreLog
    the scoreLog (with any previous scores) will be displayed
    also displays confusion matrices of current instance of arguments
    ---------------------------------------------------------------------------
    Y_train ==> TRUE classes used for training (pandas series object or numpy array of 1-D)
    pred_train ==> PREDICTION on training data (pandas series object or numpy array of 1-D)
    Y_test ==> TRUE classes to be used for testing (pandas series object or numpy array of 1-D)
    pred_test ==> PREDICTION on test data (pandas series object or numpy array of 1-D)
    model_name ==> str name for current model, to be used as index for scoreLog
    ---------------------------------------------------------------------------
    """
    # local imports keep the function self-contained (make_subplots was
    # missing here and relied on the notebook-level import)
    from sklearn import metrics
    import plotly.figure_factory as ff
    from plotly.subplots import make_subplots
    import numpy as np
    import pandas as pd
    global scoreLog
    classes = list(Y_test.unique())
    cols = ["accuracy"]
    cols.extend(["precision_" + str(c) for c in classes])
    cols.extend(["recall_" + str(c) for c in classes])
    cols.extend(["fscore_" + str(c) for c in classes])
    # create the global log on first use only (narrowed from bare except)
    try:
        scoreLog
    except NameError:
        scoreLog = pd.DataFrame(columns=cols)

    def _scores(y_true, y_pred):
        # accuracy followed by per-class precision/recall/f1, same order as cols
        row = [metrics.accuracy_score(y_true, y_pred)]
        row.extend(metrics.precision_score(y_true, y_pred, labels=classes, average=None))
        row.extend(metrics.recall_score(y_true, y_pred, labels=classes, average=None))
        row.extend(metrics.f1_score(y_true, y_pred, labels=classes, average=None))
        return row

    def _heatmap(y_true, y_pred, set_name):
        # annotated confusion-matrix heatmap for one data split
        z = pd.DataFrame(metrics.confusion_matrix(y_true, y_pred))
        fig = ff.create_annotated_heatmap(np.array(z), annotation_text=np.array(z),
                                          x=list(np.sort(np.unique(y_true))),
                                          y=list(np.sort(np.unique(y_true))),
                                          colorscale='Mint', font_colors=['grey', 'white'],
                                          name=set_name + " SET",
                                          hovertemplate="Prediction: %{x:d}<br>True: %{y:d}<br>Count: %{z:d}")
        fig.update_layout(height=350, width=350)
        fig.update_xaxes(title_text="PREDICTED (" + set_name + " SET) - " + model_name)
        fig.update_yaxes(title_text="TRUE", tickangle=270)
        return fig

    # metrics based on training set
    fig1 = _heatmap(Y_train, pred_train, "TRAINING")
    # DataFrame.append was removed in pandas 2.0 — use pd.concat instead
    scoreLog = pd.concat([scoreLog,
                          pd.DataFrame(_scores(Y_train, pred_train), index=cols,
                                       columns=[model_name + "_training"]).T])
    # metrics based on test set
    fig2 = _heatmap(Y_test, pred_test, "TEST")
    scoreLog = pd.concat([scoreLog,
                          pd.DataFrame(_scores(Y_test, pred_test), index=cols,
                                       columns=[model_name + "_test"]).T])
    # merge both confusion matrix heatplots side by side
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.05)
    fig.add_trace(fig1.data[0], row=1, col=1)
    fig.add_trace(fig2.data[0], row=1, col=2)
    # re-anchor the second heatmap's annotations onto the second axes pair
    annot1 = list(fig1.layout.annotations)
    annot2 = list(fig2.layout.annotations)
    for k in range(len(annot2)):
        annot2[k]['xref'] = 'x2'
        annot2[k]['yref'] = 'y2'
    fig.update_layout(annotations=annot1 + annot2)
    fig.layout.xaxis.update(fig1.layout.xaxis)
    fig.layout.yaxis.update(fig1.layout.yaxis)
    fig.layout.xaxis2.update(fig2.layout.xaxis)
    fig.layout.yaxis2.update(fig2.layout.yaxis)
    fig.layout.yaxis2.update({'title': {'text': ''}})
    display(scoreLog)
    fig.show()
scoreLog undefined
def cvSplitter(X, Y, k=10, seed=129):
    """Split X/Y into k cross-validation folds and return copied frames.

    Returns four lists of length k: (Xtrains, Ytrains, Xvals, Yvals);
    fold i's validation rows are disjoint from its training rows and the
    k validation slices partition the data. Requires unique index labels
    on X and Y (rows are addressed with .loc).
    """
    X = X.copy()
    Y = Y.copy()
    n = X.shape[0]
    # seed pseudo random generator for a reproducible shuffle
    np.random.seed(seed)
    shuffled = np.random.choice(X.index, n, False)  # permutation of index labels
    Xtrains, Ytrains, Xvals, Yvals = [], [], [], []
    start = 0
    for i in range(k):
        end = int(np.floor(n * (i + 1) / k))
        val_idx = list(shuffled[start:end])
        # training labels = everything outside the validation slice, kept in
        # shuffle order (O(n) slicing replaces the original O(n^2) membership
        # scan; also drops the unused `sets` list the original built)
        train_idx = list(shuffled[:start]) + list(shuffled[end:])
        Xvals.append(X.loc[val_idx].copy())
        Yvals.append(Y.loc[val_idx].copy())
        Xtrains.append(X.loc[train_idx].copy())
        Ytrains.append(Y.loc[train_idx].copy())
        start = end
    return Xtrains, Ytrains, Xvals, Yvals
class remap():
    def __init__(self):
        """performs skew correction and z-score standardisation"""
        # NOTE(review): StandardScaler is imported but never used in this class
        from sklearn.preprocessing import StandardScaler
        self.fitted=False
    def fit(self,df):
        """registers stats of the dataframe

        For each column, records skew/kurtosis/min/max and picks (via flags
        in self.fitting_info) the transform chain that most reduces |skew|:
          reflect -> optional 0-500 min-max scale (mms) -> log or sqrt.
        Left-skewed columns (skew <= -0.75) are reflected first; right-skewed
        columns (skew >= 0.75) are transformed directly.
        """
        df=df.copy()
        self.fitting_info=pd.DataFrame(columns=["skew","kurt","min","max","reflect","r_min","r_max","mms","log","sqrt"],
                                       index=df.columns)
        # initialise flags
        self.fitting_info["reflect"] = False
        self.fitting_info["mms"] = False
        self.fitting_info["log"] = False
        self.fitting_info["sqrt"] = False
        # record basic stats
        self.fitting_info["skew"] = df.skew()
        self.fitting_info["kurt"] = df.kurt()
        self.fitting_info["min"] = df.min()
        self.fitting_info["max"] = df.max()
        # test need for reflected transforms (left-skewed columns)
        collist=list(self.fitting_info.loc[self.fitting_info["skew"]<=-0.75].index)
        for col in collist:
            # read basic stats
            [cskew,cmin,cmax]=self.fitting_info.loc[col,["skew","min","max"]]
            # reflect: mirror the column so the long tail points right
            temp_r = cmax+1-df[col]
            cmin=temp_r.min()
            cmax=temp_r.max()
            self.fitting_info.loc[col,["r_min","r_max"]]=[cmin,cmax]
            # scale between 0-500
            temp_r_mms = (temp_r-cmin)*500/(cmax-cmin)
            self.fitting_info.loc[col,["mms_min","mms_max"]]=[temp_r_mms.min(),temp_r_mms.max()]
            # scaled log transform (+1 guards against log(0))
            temp_r_mms_l = (temp_r_mms+1).apply(np.log)
            # scaled sqrt transform
            temp_r_mms_s = temp_r_mms.apply(np.sqrt)
            # plain log transform
            temp_r_l = (temp_r+1).apply(np.log)
            # plain sqrt transform
            temp_r_s = temp_r.apply(np.sqrt)
            # absolute skews of the four candidate transforms
            t_skew = np.abs([temp_r_l.skew(),temp_r_s.skew(),temp_r_mms_l.skew(),temp_r_mms_s.skew()])
            # register flags only when a candidate beats the raw |skew|
            if round(min(t_skew),2)<round(abs(cskew),2):
                self.fitting_info.loc[col,"reflect"]=True
                if min(t_skew)==t_skew[0]:
                    self.fitting_info.loc[col,"log"]=True
                    df[col]=temp_r_l
                elif min(t_skew)==t_skew[1]:
                    self.fitting_info.loc[col,"sqrt"]=True
                    df[col]=temp_r_s
                elif min(t_skew)==t_skew[2]:
                    self.fitting_info.loc[col,["log","mms"]]=[True,True]
                    df[col]=temp_r_mms_l
                elif min(t_skew)==t_skew[3]:
                    self.fitting_info.loc[col,["sqrt","mms"]]=[True,True]
                    df[col]=temp_r_mms_s
        # test need for plain transforms (right-skewed columns)
        collist=list(self.fitting_info.loc[self.fitting_info["skew"]>=0.75].index)
        for col in collist:
            # read basic stats
            [cskew,cmin,cmax]=self.fitting_info.loc[col,["skew","min","max"]]
            # scale between 0-500
            temp_mms = (df[col]-cmin)*500/(cmax-cmin)
            self.fitting_info.loc[col,["mms_min","mms_max"]]=[temp_mms.min(),temp_mms.max()]
            # scaled log transform
            temp_mms_l = (temp_mms+1).apply(np.log)
            # scaled sqrt transform
            temp_mms_s = temp_mms.apply(np.sqrt)
            # plain log transform
            temp_l = (df[col]+1).apply(np.log)
            # plain sqrt transform
            temp_s = df[col].apply(np.sqrt)
            # absolute skews of the four candidate transforms
            t_skew = np.abs([temp_l.skew(),temp_s.skew(),temp_mms_l.skew(),temp_mms_s.skew()])
            # register flags
            if round(min(t_skew),2)<round(abs(cskew),2):
                if min(t_skew)==t_skew[0]:
                    self.fitting_info.loc[col,"log"]=True
                    df[col]=temp_l
                elif min(t_skew)==t_skew[1]:
                    self.fitting_info.loc[col,"sqrt"]=True
                    df[col]=temp_s
                elif min(t_skew)==t_skew[2]:
                    # scalar True broadcasts to both "log" and "mms" flags
                    # (other branches pass [True,True] — same effect)
                    self.fitting_info.loc[col,["log","mms"]]=True
                    df[col]=temp_mms_l
                elif min(t_skew)==t_skew[3]:
                    self.fitting_info.loc[col,["sqrt","mms"]]=[True,True]
                    df[col]=temp_mms_s
        # set fitted flag
        self.fitted=True
    def transform(self,df):
        """perform transforms & scaling

        Replays, per column, the chain chosen in fit():
        reflect -> mms scale -> positive shift -> log/sqrt -> reverse reflect.
        """
        if not self.fitted:
            raise ValueError("please fit remap")
            return  # NOTE(review): unreachable after raise
        df=df.copy()
        for col in df.columns:
            # find min max value (learned from the training data)
            cmin = self.fitting_info.loc[col,"min"]
            cmax = self.fitting_info.loc[col,"max"]
            # 1. reflection
            if self.fitting_info.loc[col,"reflect"]:
                temp = cmax+1-df[col]
                df[col] = temp
                # update min max
                cmin = self.fitting_info.loc[col,"r_min"]
                cmax = self.fitting_info.loc[col,"r_max"]
            # 2. min max scaling for log / sqrt
            if self.fitting_info.loc[col,"mms"]:
                temp = (df[col]-cmin)*500/(cmax-cmin)
                df[col] = temp
                # update min max
                cmin = self.fitting_info.loc[col,"mms_min"]
                cmax = self.fitting_info.loc[col,"mms_max"]
            # 3. shift data to +ve scale
            if cmin<0:
                df[col]=df[col]-cmin
                if df[col].min()<0: # reconfirm
                    df[col]=df[col]-df[col].min()
            # 4. log transform
            if self.fitting_info.loc[col,"log"]:
                df[col]=(df[col]+1).apply(np.log)
            # 5. sqrt transform
            if self.fitting_info.loc[col,"sqrt"]:
                df[col]=df[col].apply(np.sqrt)
            # 6. reverse Reflection
            if self.fitting_info.loc[col,"reflect"]:
                # NOTE(review): np.log(cmax) assumes the log branch ran;
                # for a reflected+sqrt column this looks wrong — confirm
                temp = np.log(cmax)+1-df[col]
                df[col] = temp
            # find skew
            self.fitting_info.loc[col,"trans_skew"]=df[col].skew()
        # find scaled skew
        self.fitting_info["trans_scaled_skew"]=df.skew()
        return df
    def fit_transform(self,df):
        """fit, remap"""
        self.fit(df)
        df=self.transform(df)
        return df
class pandaPoly():
    """PolynomialFeatures extraction and returns Pandas DataFrame"""
    from sklearn.preprocessing import PolynomialFeatures
    def __init__(self,degree=2, interaction_only=True):
        """degree / interaction_only are forwarded to PolynomialFeatures."""
        # bug fix: the constructor arguments were previously ignored
        # (degree=2, interaction_only=True were hard-coded)
        self.poly = self.PolynomialFeatures(degree=degree, interaction_only=interaction_only)
        self.fitted=False
    def fit(self,df):
        """Learn the polynomial expansion for df's columns."""
        self.poly.fit(df)
        self.fitted=True
    def transform(self,df):
        """Return df merged with its polynomial feature columns."""
        if not self.fitted:
            raise ValueError("please fit pandaPoly")
        df=df.copy()
        # polynomial terms get integer column names; merge keeps originals too
        d2=pd.DataFrame(self.poly.transform(df),index=df.index)
        d2=pd.merge(df,d2,left_index=True,right_index=True)
        return d2
    def fit_transform(self,df):
        """fit then transform"""
        self.fit(df)
        df=self.transform(df)
        return df
class dummies:
    """to implement encoding without data leak (one-hot for object columns)"""
    def __init__(self):
        """input : dataframe"""
        # ref maps column name -> categories seen at fit time,
        # ordered by descending frequency (so ref[col][0] is the mode)
        self.ref={}
        self.fitted=False
    def fit(self,df):
        """Collect required encoding information"""
        cat=list(df.select_dtypes(include='object').columns)
        for col in cat:
            unq=list(df[col].value_counts().index)
            self.ref.update({col:unq})
        self.fitted=True
        return
    def transform(self,df):
        """perform encoding

        Unseen categories yield all-zero indicator rows; the most
        frequent category is dropped (drop_first behaviour)."""
        if not self.fitted:
            raise ValueError("please fit first")
        df=df.copy()
        for col in self.ref:
            unq=self.ref.get(col)
            for val in unq:
                # 1/0 indicator for this category
                df[col+"_"+str(val)]=(df[col]==val).astype('uint8')
            df.drop(col,axis=1,inplace=True)
            # drop the baseline (most frequent) dummy: drop_first=True
            # bug fix: original indexed unq with the loop *value* (unq[i]
            # after the loop), crashing for non-integer categories
            df.drop(col+"_"+str(unq[0]),axis=1,inplace=True)
        df = df.apply(pd.to_numeric,errors='ignore',downcast='float',axis=0)
        return df
    def fit_transform(self,df):
        """learn and encode"""
        self.fit(df)
        df=self.transform(df)
        return df
class pandaCluster():
    """Fits KMeans on remapped data and returns a Pandas DataFrame with
    one-hot cluster columns appended to the original features."""
    def __init__(self,n_clusters=4): # 4 selected for simplicity
        self.fitted=False
        # component models: skew-corrector, cluster encoder, clusterer
        self.scl = remap()
        self.dum = dummies()
        self.clt = KMeans(n_clusters=n_clusters)
    def fit(self,df):
        # scale incoming data before clustering
        scaled = self.scl.fit_transform(df.copy())
        # cluster fitting
        self.clt.fit(scaled)
        # fit the encoder on the predicted cluster labels
        labels = pd.DataFrame(self.clt.predict(scaled),
                              columns=["CLUSTER"],
                              index=scaled.index, dtype='object')
        self.dum.fit(labels)
        self.fitted=True
    def transform(self,df):
        if not self.fitted:
            raise ValueError("please fit pandaCluster")
        source = df.copy()
        # scale, predict clusters, then one-hot encode the labels
        scaled = self.scl.transform(source.copy())
        labels = pd.DataFrame(self.clt.predict(scaled),
                              columns=["CLUSTER"],
                              index=scaled.index, dtype='object')
        encoded = self.dum.transform(labels)
        # merge the cluster dummies back onto the untouched source columns
        return pd.merge(source, encoded, left_index=True, right_index=True)
    def fit_transform(self,df):
        self.fit(df)
        return self.transform(df)
below function implements the following steps
4. Data pre-processing:
B. Check for target balancing and fix it if found imbalanced.
5. Model training, testing and tuning:
A. Use any Supervised Learning technique to train a model.
E. Display and explain the classification report in detail.
6. Post Training and Conclusion:
A. Display and compare all the models designed with their train and test accuracies.
def dtc_pipe(X_train, X_test, Y_train, Y_test, mname):
    """Standardise, balance, fit a shallow decision tree, and log scores."""
    # z-score scaling: fit on train, apply (transform only) to both splits
    zscaler = StandardScaler()
    train_std = pd.DataFrame(zscaler.fit_transform(X_train),
                             columns=X_train.columns, index=X_train.index)
    test_std = pd.DataFrame(zscaler.transform(X_test),
                            columns=X_test.columns, index=X_test.index)
    # oversample the minority class(es) on the training split only
    smote = SMOTE(sampling_strategy='not majority', random_state=129)
    bal_X, bal_Y = smote.fit_resample(train_std, Y_train)
    # a shallow tree keeps the baseline interpretable
    tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state=129)
    tree.fit(bal_X, bal_Y)
    # predictions are scored against the original (imbalanced) splits
    pred_tr = tree.predict(train_std)
    pred_te = tree.predict(test_std)
    # record scores
    reporter(Y_train, pred_tr, Y_test, pred_te, mname)
    # generate reports (custom-built function : code in the beginning of notebook)
DOMAIN: Semiconductor manufacturing process
• CONTEXT:
A complex modern semiconductor manufacturing process is normally under constant surveillance via the monitoring of signals/variables collected from sensors and or process measurement points. However, not all of these signals are equally valuable in a specific monitoring system. The measured signals contain a combination of useful information, irrelevant information as well as noise. Engineers typically have a much larger number of signals than are actually required. If we consider each type of signal as a feature, then feature selection may be applied to identify the most relevant signals. The Process Engineers may then use these signals to determine key factors contributing to yield excursions downstream in the process. This will enable an increase in process throughput, decreased time to learning and reduce the per unit production costs. These signals can be used as features to predict the yield type. And by analysing and trying out different combinations of features, essential signals that are impacting the yield type can be identified.
• DATA DESCRIPTION: sensor-data.csv : (1567, 592)
The data consists of 1567 datapoints each with 591 features. The dataset presented in this case represents a selection of such features where each example represents a single production entity with associated measured features and the labels represent a simple pass/fail yield for in house line testing. Target column “–1” corresponds to a pass and “1” corresponds to a fail and the data time stamp is for that specific test point.
• PROJECT OBJECTIVE:
We will build a classifier to predict the Pass/Fail yield of a particular process entity and analyse whether all the
features are required to build the model or not.
Steps and tasks:
1.Import and understand the data.
A. Import ‘signal-data.csv’ as DataFrame.
B. Print 5 point summary and share at least 2 observations.
# read the dataset
df=pd.read_csv("signal-data.csv")
# preview the first five rows
df.head()
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ... | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | 0.0162 | -0.0034 | 0.9455 | 202.4396 | 0.0 | 7.9558 | 414.8710 | 10.0433 | 0.9680 | 192.3963 | 12.5190 | 1.4026 | -5419.00 | 2916.50 | -4043.75 | ... | NaN | NaN | NaN | NaN | 533.8500 | 2.1113 | 8.95 | 0.3157 | 3.0624 | 0.1026 | 1.6765 | 14.9509 | NaN | NaN | NaN | NaN | 0.5005 | 0.0118 | 0.0035 | 2.3630 | NaN | NaN | NaN | NaN | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | -0.0005 | -0.0148 | 0.9627 | 200.5470 | 0.0 | 10.1548 | 414.7347 | 9.2599 | 0.9701 | 191.2872 | 12.4608 | 1.3825 | -5441.50 | 2604.25 | -3498.75 | ... | NaN | NaN | NaN | NaN | 535.0164 | 2.4335 | 5.92 | 0.2653 | 2.0111 | 0.0772 | 1.1065 | 10.9003 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | 0.0041 | 0.0013 | 0.9615 | 202.0179 | 0.0 | 9.5157 | 416.7075 | 9.3144 | 0.9674 | 192.7035 | 12.5404 | 1.4123 | -5447.75 | 2701.75 | -4047.00 | ... | 0.4122 | 0.2562 | 0.4119 | 68.8489 | 535.0245 | 2.0293 | 11.21 | 0.1882 | 4.0923 | 0.0640 | 2.0952 | 9.2721 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | -0.0124 | -0.0033 | 0.9629 | 201.8482 | 0.0 | 9.6052 | 422.2894 | 9.6924 | 0.9687 | 192.1557 | 12.4782 | 1.4011 | -5468.25 | 2648.25 | -4515.00 | ... | 3.5611 | 0.0670 | 2.7290 | 25.0363 | 530.5682 | 2.0253 | 9.33 | 0.1738 | 2.8971 | 0.0525 | 1.7585 | 8.5831 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | -0.0031 | -0.0072 | 0.9569 | 201.9424 | 0.0 | 10.5661 | 420.5925 | 10.3387 | 0.9735 | 191.6037 | 12.4735 | 1.3888 | -5476.25 | 2635.25 | -3987.50 | ... | NaN | NaN | NaN | NaN | 532.0155 | 2.0275 | 8.83 | 0.2224 | 3.1776 | 0.0706 | 1.6597 | 10.9698 | NaN | NaN | NaN | NaN | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
5 rows × 592 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 592 entries, Time to Pass/Fail dtypes: float64(590), int64(1), object(1) memory usage: 7.1+ MB
df.select_dtypes(include='int64').columns
Index(['Pass/Fail'], dtype='object')
df.select_dtypes(include='object').columns
Index(['Time'], dtype='object')
df.select_dtypes(include='object').describe()
| Time | |
|---|---|
| count | 1567 |
| unique | 1534 |
| top | 2008-10-15 01:52:00 |
| freq | 3 |
Every column is numeric except for the Time column.
Later, let's see if we can extract features from the Time column, else drop it.
Also, the Time column seems to have duplicates, which could be the case for the other columns too;
need to confirm before dropping those.
# typecast to datetime
df.Time=pd.to_datetime(df.Time)
# print 5 point summary (last 5 rows of describe: 25%/50%/75%/max/std)
# NOTE(review): datetime_is_numeric was removed in pandas 2.0 — drop it when upgrading
df.describe(datetime_is_numeric=True)[-5:]
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ... | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25% | 2008-07-09 15:32:00 | 2966.260000 | 2452.247500 | 2181.044400 | 1081.87580 | 1.01770 | 100.0 | 97.920000 | 0.121100 | 1.411200 | -0.010800 | -0.005600 | 0.958100 | 198.130700 | 0.0 | 7.094875 | 406.127400 | 9.567625 | 0.968200 | 188.299825 | 12.460000 | 1.396500 | -5933.250000 | 2578.000000 | -4371.750000 | ... | 2.090200 | 0.038200 | 1.884400 | 15.466200 | 530.702700 | 1.982900 | 7.500000 | 0.242250 | 2.56785 | 0.075100 | 1.408450 | 11.501550 | 0.01380 | 0.01060 | 0.003400 | 46.184900 | 0.497900 | 0.01160 | 0.00310 | 2.306500 | 0.013425 | 0.010600 | 0.003300 | 44.368600 | -1.00000 |
| 50% | 2008-08-23 13:02:00 | 3011.490000 | 2499.405000 | 2201.066700 | 1285.21440 | 1.31680 | 100.0 | 101.512200 | 0.122400 | 1.461600 | -0.001300 | 0.000400 | 0.965800 | 199.535600 | 0.0 | 8.967000 | 412.219100 | 9.851750 | 0.972600 | 189.664200 | 12.499600 | 1.406000 | -5523.250000 | 2664.000000 | -3820.750000 | ... | 2.150450 | 0.048650 | 1.999700 | 16.988350 | 532.398200 | 2.118600 | 8.650000 | 0.293400 | 2.97580 | 0.089500 | 1.624500 | 13.817900 | 0.02040 | 0.01480 | 0.004700 | 72.288900 | 0.500200 | 0.01380 | 0.00360 | 2.757650 | 0.020500 | 0.014800 | 0.004600 | 71.900500 | -1.00000 |
| 75% | 2008-09-22 11:16:30 | 3056.650000 | 2538.822500 | 2218.055500 | 1591.22350 | 1.52570 | 100.0 | 104.586700 | 0.123800 | 1.516900 | 0.008400 | 0.005900 | 0.971300 | 202.007100 | 0.0 | 10.861875 | 419.089275 | 10.128175 | 0.976800 | 192.189375 | 12.547100 | 1.415000 | -5356.250000 | 2841.750000 | -3352.750000 | ... | 3.098725 | 0.075275 | 2.970850 | 24.772175 | 534.356400 | 2.290650 | 10.130000 | 0.366900 | 3.49250 | 0.112150 | 1.902000 | 17.080900 | 0.02770 | 0.02000 | 0.006475 | 116.539150 | 0.502375 | 0.01650 | 0.00410 | 3.295175 | 0.027600 | 0.020300 | 0.006400 | 114.749700 | -1.00000 |
| max | 2008-12-10 18:47:00 | 3356.350000 | 2846.440000 | 2315.266700 | 3715.04170 | 1114.53660 | 100.0 | 129.252200 | 0.128600 | 1.656400 | 0.074900 | 0.053000 | 0.984800 | 272.045100 | 0.0 | 19.546500 | 824.927100 | 102.867700 | 0.984800 | 215.597700 | 12.989800 | 1.453400 | 0.000000 | 3656.250000 | 2363.000000 | ... | 14.014100 | 0.293200 | 12.746200 | 84.802400 | 589.508200 | 2.739500 | 454.560000 | 2.196700 | 170.02040 | 0.550200 | 90.423500 | 96.960100 | 0.10280 | 0.07990 | 0.028600 | 737.304800 | 0.509800 | 0.47660 | 0.10450 | 99.303200 | 0.102800 | 0.079900 | 0.028600 | 737.304800 | 1.00000 |
| std | NaN | 73.621787 | 80.407705 | 29.513152 | 441.69164 | 56.35554 | 0.0 | 6.237214 | 0.008961 | 0.073897 | 0.015116 | 0.009302 | 0.012452 | 3.257276 | 0.0 | 2.796596 | 17.221095 | 2.403867 | 0.012062 | 2.781041 | 0.217965 | 0.016737 | 626.822178 | 295.498535 | 1380.162148 | ... | 1.032761 | 0.032761 | 0.996644 | 10.213294 | 17.499736 | 0.275112 | 86.304681 | 0.248478 | 26.92015 | 0.067791 | 16.921369 | 12.485267 | 0.01173 | 0.00964 | 0.003116 | 87.520966 | 0.003404 | 0.01718 | 0.00372 | 3.578033 | 0.012358 | 0.008808 | 0.002867 | 93.891919 | 0.49801 |
5 rows × 592 columns
There are a few constant columns like "13","42",...
There are a few extremely skewed or quasi-constant columns like "4","21"...
There are a few near-perfect bell curves like "24".
Need to review and remove columns that don't add information to the target. In reference to the target, the dataset seems imbalanced, as more than 75% of the data corresponds to -1.
2.Data cleansing:
A. Write a for loop which will remove all the features with 20%+ Null values and impute rest with mean of the feature.
B. Identify and drop the features which are having same value for all the rows.
# verify if target column has nans
df["Pass/Fail"].isna().sum()
0
safe to continue without dropping any records
%%time
#lets review the nulls
nulsCount(df)
#(custom-built function : code in the begining of notebook)
CPU times: user 253 ms, sys: 39.5 ms, total: 292 ms Wall time: 291 ms
| NULL | NULL % | NAN | NAN % | BLANKS | BLANKS % | UNEXP | UNEXP % | |
|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 0.38 | 6 | 0.38 | 0 | 0.0 | 0 | 0.0 |
| 1 | 7 | 0.45 | 7 | 0.45 | 0 | 0.0 | 0 | 0.0 |
| 2 | 14 | 0.89 | 14 | 0.89 | 0 | 0.0 | 0 | 0.0 |
| 3 | 14 | 0.89 | 14 | 0.89 | 0 | 0.0 | 0 | 0.0 |
| 4 | 14 | 0.89 | 14 | 0.89 | 0 | 0.0 | 0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 585 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
| 586 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
| 587 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
| 588 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
| 589 | 1 | 0.06 | 1 | 0.06 | 0 | 0.0 | 0 | 0.0 |
538 rows × 8 columns
%%time
# lets review least number of uniques in the features
df.nunique().sort_values()[:5]
CPU times: user 121 ms, sys: 0 ns, total: 121 ms Wall time: 156 ms
262 1 263 1 264 1 265 1 266 1 dtype: int64
# benchmark shape
df.shape
(1567, 592)
df_raw=df.copy()
%%time
# single pass over columns: drop constants, drop sparse columns, mean-impute the rest
for col in df.columns:
    if df[col].nunique()==1: # features having same value for all rows
        df.drop([col],axis=1,inplace=True)
    elif df[col].isnull().sum()/df.shape[0]>0.2: # features with 20%+ Null values
        df.drop([col],axis=1,inplace=True)
    elif df[col].isnull().sum()>0: # features having at least 1 null
        # assign back instead of inplace fillna on a column view —
        # chained assignment is unreliable under pandas copy-on-write
        df[col] = df[col].fillna(df[col].mean().astype('float32'))
CPU times: user 547 ms, sys: 7.34 ms, total: 554 ms Wall time: 551 ms
# review shape
df.shape
(1567, 444)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 444 entries, Time to Pass/Fail dtypes: datetime64[ns](1), float64(442), int64(1) memory usage: 5.3 MB
# review nulls
nulsCount(df)
None found, hence let's proceed.
----------------------------------------------------------------------------
Let us set a base line model using DecisionTreeClassifier
# separate predictors & targets
X = df[df.columns[1:-1]]  # all sensor features (the Time column is excluded)
Y = df[df.columns[-1]]    # Pass/Fail target
# Train & Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.20, # split ratio of 80:20
                                                    random_state=129) # random seed
# model learning: shallow tree baseline, no scaling/balancing yet
dtc=DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state=129)
dtc.fit(X_train,Y_train)
# predict
pred_train=dtc.predict(X_train)
pred_test=dtc.predict(X_test)
# record scores
reporter(Y_train,pred_train,Y_test,pred_test,"DTC_raw")
# generate reports (custom-built function : code in the beginning of notebook)
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
Pretty impressive accuracy and low execution time,
but unfortunately the precision, recall and f1-score for the FAIL class (+1) are very poor.
They are poor on the training-data predictions, probably because of the imbalanced data;
on the test-data predictions they have fallen even lower, indicating an over-fitted model.
Let's build on our modelling:
before proceeding further, let's extract some timestamp features & inherent clusters.
# benchmark
X_train.shape
(1253, 442)
the below snippet adds to the following step
2.Data cleansing:
E. Make all relevant modifications on the data using both functional/logical reasoning/assumptions.
# work on an explicit copy: X was sliced from df, so assigning new columns
# to it triggered SettingWithCopy warnings / ambiguous writes
X = X.copy()
# lets extract some features from the date time
X["Year"]=df.Time.dt.year
X["Mon"]=df.Time.dt.month
X["day"]=df.Time.dt.day
X["day_of_week"]=df.Time.dt.day_of_week
X["day_of_year"]=df.Time.dt.day_of_year
X["weekofyear"]=df.Time.dt.isocalendar().week
X["Hour"]=df.Time.dt.hour
X["Min"]=df.Time.dt.minute
X["Sec"]=df.Time.dt.second
X["Qtr"]=df.Time.dt.quarter
# Train & Test Split (re-split so the new features flow through)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.20, # split ratio of 80:20
                                                    random_state=129) # random seed
# review
X_train.shape
(1253, 452)
%%time
# lets add few features about the inherent clusters in the dataset
clt=pandaCluster() # custom class : code in the beginning of notebook
X_train_clt=clt.fit_transform(X_train)
X_test_clt=clt.transform(X_test) # transform only: no refit on test, avoids leakage
CPU times: user 5.23 s, sys: 82.7 ms, total: 5.32 s Wall time: 2.89 s
# review
X_train_clt.shape
(1253, 455)
below function implements the following steps
4. Data pre-processing:
B. Check for target balancing and fix it if found imbalanced.
5. Model training, testing and tuning:
A. Use any Supervised Learning technique to train a model.
E. Display and explain the classification report in detail.
6. Post Training and Conclusion:
A. Display and compare all the models designed with their train and test accuracies.
# review model performance
# custom pipe : code in the beginning of notebook
# (pipe presumably scales, balances, fits a DTC and appends scores -- per its use below)
dtc_pipe(X_train_clt, X_test_clt, Y_train, Y_test,"DTC2_time_clt")
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
| DTC2_time_clt_training | 0.865922 | 0.958904 | 0.221519 | 0.895141 | 0.437500 | 0.925926 | 0.294118 |
| DTC2_time_clt_test | 0.863057 | 0.942652 | 0.228571 | 0.906897 | 0.333333 | 0.924429 | 0.271186 |
the model performance has significantly improved in terms of the FAIL class
probably caused by the combined effect of feature additions, standardisation & target class balancing
2.Data cleansing:
C. Drop other features if required using relevant functional knowledge. Clearly justify the same.
# let us review spread of all features via their per-column standard deviations
stddev=pd.DataFrame(X_train_clt.std(),columns=["stddev"])
gdata=list(stddev.stddev)
# distribution of the standard deviations themselves
# (label 'Stdandard Deviations' is a typo in a runtime string -- left as-is here)
fig = ff.create_distplot([gdata],['Stdandard Deviations'],
curve_type='kde',show_hist=True,
show_rug=True
)
fig.update_layout(height=500,width=1000,showlegend=False)
fig.show()
# fine-grained percentile table of the stddev distribution
stddev.describe(percentiles=[0.1,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.75,0.8,0.9]).T
| count | mean | std | min | 10% | 20% | 25% | 30% | 40% | 50% | 60% | 70% | 75% | 80% | 90% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| stddev | 455.0 | 74.49824 | 435.643811 | 0.0 | 0.009242 | 0.027431 | 0.05275 | 0.075622 | 0.25302 | 1.063889 | 2.917544 | 6.515314 | 9.233332 | 17.836337 | 61.769207 | 6522.814779 |
the variances (or standard deviations) of several features are condensed below unity
this indicates that several features would not contribute to the model learning
though z-score transformation will shift & rescale the distributions, it would also leverage all the noise in the data towards model learning
hence let us use few feature selection techniques to shrink our dataset
# let us remove any quasi-constant features (variance below 1%)
quasi = VarianceThreshold(threshold=0.01) #quasi-constant ness of 1%
# wrap the numpy output back into DataFrames, keeping surviving column names
# (selected via get_support()) and the original row index
X_train_quasi=pd.DataFrame(quasi.fit_transform(X_train_clt),
columns=X_train_clt.columns[quasi.get_support()],index=X_train_clt.index)
X_test_quasi=pd.DataFrame(quasi.transform(X_test_clt),
columns=X_train_clt.columns[quasi.get_support()],index=X_test_clt.index)
X_train_quasi.shape
(1253, 306)
X_train_clt.shape[1]-X_train_quasi.shape[1]
149
149 Quasi constant features were trimmed off leaving behind 306 features
# review model performance
# custom pipe : code in the begining of notebook
dtc_pipe(X_train_quasi, X_test_quasi, Y_train, Y_test,"DTC3_quasi")
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
| DTC2_time_clt_training | 0.865922 | 0.958904 | 0.221519 | 0.895141 | 0.437500 | 0.925926 | 0.294118 |
| DTC2_time_clt_test | 0.863057 | 0.942652 | 0.228571 | 0.906897 | 0.333333 | 0.924429 | 0.271186 |
| DTC3_quasi_training | 0.642458 | 0.971391 | 0.119835 | 0.636829 | 0.725000 | 0.769310 | 0.205674 |
| DTC3_quasi_test | 0.585987 | 0.949438 | 0.110294 | 0.582759 | 0.625000 | 0.722222 | 0.187500 |
though the test scores seem to have reduced, the quantum of features dropped is a good trade-off against it
lets study further
SCFS (Standard deviation and Cosine similarity based Feature Selection)
Reference article for feature scoring
using custom method based on published paper from
https://www.frontiersin.org/articles/10.3389/fgene.2021.684100/full
Credits to: Juanying Xie, Mingzhao Wang, Shengquan Xu, Zhao Huang and Philip W. Grant
Explanation & Justification to use the method
The discernibility of a feature, refers to its distinguishable capability between categories
Feature selection aims to detect the features whose distinguishable capability is strong while the redundancy between them is less
To represent the redundancy between a feature and the other features, cosine similarity is used
Feature independence is deduced from cosine similarity ( in 3 possible ways)
The method guarantees that a feature will have the maximal independence as far as possible once it has the maximal discernibility
%%time
# custom class code written at the beginning of the notebook
# SCFS feature scoring (exponential independence kind) on unscaled quasi-trimmed data
scfs=SCFS(kind='exp')
# evaluate feature scores
scfs.fit(X_train_quasi)
# lets review the feature scores, sorted ascending
fig=go.Figure()
gdata=scfs.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=gdata,name='feature score'))
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="scores-->")
fig.show()
# discernibility vs independence scatter to inspect the two score components
fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs.dis,y=scfs.ind,mode='markers',name='discernibility vs independence'))
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="independence-->")
fig.show()
CPU times: user 17.6 s, sys: 73.3 ms, total: 17.7 s Wall time: 17.5 s
the feature discernibility scales has over powered the feature independence scale, thus the above curve seems asymptotic to axes
let us perform standardisation and then use the SCFS technique
%%time
# custom class code written at the beginning of the notebook
# repeat SCFS (exp kind), this time after z-score standardisation
scfs_std=SCFS(kind='exp')
# standardize
scl=StandardScaler()
X_train_std = pd.DataFrame(scl.fit_transform(X_train_quasi),
columns=X_train_quasi.columns,index=X_train_quasi.index)
# evaluate feature scores
scfs_std.fit(X_train_std)
# lets review the feature scores
fig=go.Figure()
gdata=scfs_std.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=gdata,name='feature score'))
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="scores-->")
fig.show()
# component scatter
fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs_std.dis,y=scfs_std.ind,mode='markers'))
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="independence-->")
fig.show()
CPU times: user 17.3 s, sys: 101 ms, total: 17.4 s Wall time: 17.2 s
standard scaler has changed all feature discernibility to unity rendering no meaningful information
let us try minmaxscaler
%%time
# custom class code written at the beginning of the notebook
# repeat SCFS (exp kind) after min-max scaling to [-1, 1]
scfs_mima=SCFS(kind='exp')
# standardize
mima=MinMaxScaler(feature_range=(-1,1))
X_train_mima = pd.DataFrame(mima.fit_transform(X_train_quasi),
columns=X_train_quasi.columns,index=X_train_quasi.index)
# evaluate feature scores
scfs_mima.fit(X_train_mima)
# lets review the feature scores
fig=go.Figure()
gdata=scfs_mima.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=gdata,name='feature score'))
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="scores-->")
fig.show()
# component scatter
fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs_mima.dis,y=scfs_mima.ind,mode='markers'))
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="independence-->")
fig.show()
CPU times: user 17.1 s, sys: 59.1 ms, total: 17.1 s Wall time: 17.1 s
# review potential trimmed feature count
(scfs_mima.fscore>0.2).sum()
44
the above feature scoring plot seems meaningful with an approximate elbow formed around a certain feature score
let us try other 2 independence score kinds using same minmaxscaler
%%time
# custom class code written at the beginning of the notebook
# SCFS with 'reciprocal' independence kind, same min-max scaled data
scfs_mima_reci=SCFS(kind='reciprocal')
# standardize
mima=MinMaxScaler(feature_range=(-1,1))
X_train_mima = pd.DataFrame(mima.fit_transform(X_train_quasi),
columns=X_train_quasi.columns,index=X_train_quasi.index)
# evaluate feature scores
scfs_mima_reci.fit(X_train_mima)
# lets review the feature scores (log-scale because reciprocal scores span orders of magnitude)
fig=go.Figure()
gdata=scfs_mima_reci.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=np.log(gdata),name='feature score')) # graph y-scale enhanced
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="log(scores)-->")
fig.show()
# component scatter (independence also on log scale)
fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs_mima_reci.dis,y=np.log(scfs_mima_reci.ind),mode='markers')) # graph y-scale enhanced
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="log(independence)-->")
fig.show()
CPU times: user 17.7 s, sys: 101 ms, total: 17.8 s Wall time: 17.6 s
# review potential trimmed feature count
(np.log(scfs_mima_reci.fscore)>0).sum()
31
reciprocal method gives significantly reduced number of features
%%time
# custom class code written at the beginning of the notebook
# SCFS with 'anti-similarity' independence kind, same min-max scaled data
scfs_mima_as=SCFS(kind='anti-similarity')
# standardize
mima=MinMaxScaler(feature_range=(-1,1))
X_train_mima = pd.DataFrame(mima.fit_transform(X_train_quasi),
columns=X_train_quasi.columns,index=X_train_quasi.index)
# evaluate feature scores
scfs_mima_as.fit(X_train_mima)
# lets review the feature scores
fig=go.Figure()
gdata=scfs_mima_as.fscore.sort_values()
fig.add_trace(go.Scatter(x=gdata.index, y=gdata,name='feature score'))
fig.update_xaxes(title="features-->")
fig.update_yaxes(title="scores-->")
fig.show()
# component scatter
fig=go.Figure()
fig.add_trace(go.Scatter(x=scfs_mima_as.dis,y=scfs_mima_as.ind,mode='markers'))
fig.update_xaxes(title="discernibility-->")
fig.update_yaxes(title="independence-->")
fig.show()
CPU times: user 17.6 s, sys: 92.4 ms, total: 17.6 s Wall time: 17.5 s
# review potential trimmed feature count
(scfs_mima_as.fscore>0.2).sum()
35
since the reciprocal method produces a better elbow, and returns minimal features
lets choose the reciprocal independence method with a threshold of 0 scores in log scale
%%time
# iterative backward elimination: repeatedly drop the single worst-scored
# feature and re-fit SCFS, until no feature has log(score) < 0
X_train_SCFS=X_train_mima.copy()
# current progress
dims=X_train_SCFS.shape
# iteratively reduce features
# using reciprocal method with log threshold of 0
scfs_iter=SCFS(kind='reciprocal')
scfs_iter.fit(X_train_SCFS)
logscore=np.log(scfs_iter.fscore)
thresher=logscore.min()
# NOTE(review): one full SCFS re-fit per dropped feature is what makes this
# cell take ~30 min; dropping all negative-log-score features at once would be
# far faster but would change the selection result -- TODO evaluate trade-off.
while thresher<0:
ind=logscore.argmin()
feat=scfs_iter.fscore.index[ind]
X_train_SCFS.drop(feat,axis=1,inplace=True)
scfs_iter.fit(X_train_SCFS)
logscore=np.log(scfs_iter.fscore)
thresher=logscore.min()
dims=X_train_SCFS.shape
CPU times: user 30min 14s, sys: 3.99 s, total: 30min 18s Wall time: 30min 10s
# review trimmed shape
X_train_SCFS.shape
(1253, 34)
# review final feature scores
np.log(scfs_iter.fscore).describe()
count 34.000000 mean 0.874429 std 1.385611 min 0.037813 25% 0.203459 50% 0.422168 75% 0.964522 max 7.779985 dtype: float64
all low scored features have been removed leaving 34 features to go ahead
# the above trimmed dataset output is from minmax scaled data
# hence lets obtain the same selected features from the original unscaled data
cols=scfs_iter.fscore.index
# filter both splits to the surviving feature set
X_train_SCFS=X_train_quasi[cols].copy()
X_test_SCFS=X_test_quasi[cols].copy()
# review shape
X_train_SCFS.shape
(1253, 34)
# lets test model performance
dtc_pipe(X_train_SCFS, X_test_SCFS, Y_train, Y_test,"DTC4_SCFS")
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
| DTC2_time_clt_training | 0.865922 | 0.958904 | 0.221519 | 0.895141 | 0.437500 | 0.925926 | 0.294118 |
| DTC2_time_clt_test | 0.863057 | 0.942652 | 0.228571 | 0.906897 | 0.333333 | 0.924429 | 0.271186 |
| DTC3_quasi_training | 0.642458 | 0.971391 | 0.119835 | 0.636829 | 0.725000 | 0.769310 | 0.205674 |
| DTC3_quasi_test | 0.585987 | 0.949438 | 0.110294 | 0.582759 | 0.625000 | 0.722222 | 0.187500 |
| DTC4_SCFS_training | 0.705507 | 0.953725 | 0.106267 | 0.720375 | 0.487500 | 0.820787 | 0.174497 |
| DTC4_SCFS_test | 0.684713 | 0.940092 | 0.113402 | 0.703448 | 0.458333 | 0.804734 | 0.181818 |
though the scores seem to have reduced, the divide between training and testing scores has greatly reduced, inferring a significant reduction in data noise
this justifies the power of SCFS methodology
Let us try to study feature importances from the DTree classifier
%%time
# standardize: scaler is fit on the training split only, then applied to test
scl=StandardScaler()
X_train_std = pd.DataFrame(scl.fit_transform(X_train_SCFS),
columns=X_train_SCFS.columns,index=X_train_SCFS.index)
X_test_std = pd.DataFrame(scl.transform(X_test_SCFS),
columns=X_test_SCFS.columns,index=X_test_SCFS.index)
# balance training data set (SMOTE applied to the training split only)
balancer = SMOTE(sampling_strategy='not majority', random_state=129)
X_train_bal, Y_train_bal = balancer.fit_resample(X_train_std,Y_train)
# model learning: unconstrained-depth tree, used only to extract importances
dtc=DecisionTreeClassifier(criterion = 'gini', random_state=129)
dtc.fit(X_train_bal,Y_train_bal)
# store feature importances in sequence
# NOTE(review): fimp here is an ascending Index and is overwritten by the
# descending-sorted DataFrame computed in the next cell
fimp = pd.DataFrame(dtc.feature_importances_,index=X_train_bal.columns)
fimp=fimp.sort_values(by=0,ascending=True).index
CPU times: user 266 ms, sys: 12.1 ms, total: 278 ms Wall time: 69.6 ms
# rebuild fimp as a DataFrame sorted by importance, highest first
fimp=pd.DataFrame(dtc.feature_importances_,index=X_train_bal.columns).sort_values(by=0,ascending=False)
fig=go.Figure()
y=fimp[0]
x=list(fimp.index)
fig.add_trace(go.Bar(x=x,y=y,name='importance'))
# cumulative curve shows how many features cover most of the total importance
y=np.array(fimp[0]).cumsum()
fig.add_trace(go.Scatter(x=x,y=y,name='cumulative importance'))
fig.update_xaxes(title="features --->")
fig.update_yaxes(title="importance --->")
fig.show()
the above feature importance plot shows a gradual increase/decrease of influence by every next feature
we cannot justify to drop any further features from the above, hence lets move on
-----------------------------------------------------------------------------------------------
by now, the following project statements have been covered in various sections and mentioned here to keep track
2.Data cleansing:
D. Check for multi-collinearity in the data and take necessary action.
# verify correlation in base data: count feature pairs with |r| > 0.75
# (subtract shape[1] to discard the diagonal self-correlations, halve because
# the correlation matrix is symmetric)
((abs(X_train_quasi.corr())>0.75).sum().sum()-X_train_quasi.shape[1])/2
492.0
there had been 492 cases of multi-collinearity pairs within just 306 features after quasi-constant feature elimination
# verify correlation in SCFS data: same pair count (diagonal removed, symmetric halved)
((abs(X_train_SCFS.corr())>0.75).sum().sum()-X_train_SCFS.shape[1])/2
1.0
there is only one case of high correlation, since SCFS has already taken the dependence between features into consideration for feature scoring
# annotated correlation heatmap of the SCFS-trimmed training features
z=pd.DataFrame(X_train_SCFS.corr())
fig=ff.create_annotated_heatmap(np.array(z),annotation_text=np.array(z).round(1),
colorscale='RdBu',zmin=-1,zmax=1,font_colors = ['Blue','Grey'])
# no fig.show() -- relies on the notebook auto-displaying the last expression
fig.update_layout(height=1000,width=1000)
it could be seen that the maximum correlation is 0.7 or -0.6, hence there is not much multicollinearity, except for 2 or 3 pairs
lets investigate further, using Variance Inflation Factors
by definition, the variance inflation factor is a measure for the increase of the variance of the parameter estimates if an additional variable, given by exog_idx is added to the linear regression. It is a measure for multicollinearity of the design matrix, exog.
One recommendation is that if VIF is greater than 5, then the explanatory variable given by exog_idx is highly collinear with the other explanatory variables, and the parameter estimates will have large standard errors because of this.
hence features having VIF above 5 needs to be studied for dropping
%%time
# let us drop features for VIF > 5
X_train_vif=X_train_SCFS.copy()
#obtain vif
cols=X_train_vif.columns
vif = pd.DataFrame(index=cols)
vif["VIF"]=[variance_inflation_factor(X_train_vif.values, i) for i in range(len(cols))]
#lets display the vif summary before trimming
print("for ",X_train_vif.shape[1]," features")
display(vif.describe().T)
while vif.max()[0]>5:
col=vif.index[np.argmax(vif["VIF"])] # select top vif column
X_train_vif.drop(col,axis=1,inplace=True)
#recompute VIF
del vif
cols=X_train_vif.columns
vif = pd.DataFrame(index=cols)
vif["VIF"]=[variance_inflation_factor(X_train_vif.values, i) for i in range(len(cols))]
# lets review the vif after trimming
print("\nfor ",X_train_vif.shape[1]," features")
display(vif.describe().T)
for 34 features
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| VIF | 34.0 | 154.336511 | 440.343532 | 1.599582 | 2.603413 | 10.50677 | 23.499655 | 2349.301543 |
for 14 features
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| VIF | 14.0 | 2.195455 | 0.677902 | 1.242474 | 1.696825 | 2.069178 | 2.624982 | 3.40984 |
CPU times: user 2.94 s, sys: 72.2 ms, total: 3.02 s Wall time: 1 s
# lets study the model performance on the VIF-trimmed feature set
# test data selection: keep only the columns that survived VIF elimination
X_test_vif = X_test_SCFS[X_train_vif.columns]
dtc_pipe(X_train_vif, X_test_vif, Y_train, Y_test,"DTC5_vif")
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
| DTC2_time_clt_training | 0.865922 | 0.958904 | 0.221519 | 0.895141 | 0.437500 | 0.925926 | 0.294118 |
| DTC2_time_clt_test | 0.863057 | 0.942652 | 0.228571 | 0.906897 | 0.333333 | 0.924429 | 0.271186 |
| DTC3_quasi_training | 0.642458 | 0.971391 | 0.119835 | 0.636829 | 0.725000 | 0.769310 | 0.205674 |
| DTC3_quasi_test | 0.585987 | 0.949438 | 0.110294 | 0.582759 | 0.625000 | 0.722222 | 0.187500 |
| DTC4_SCFS_training | 0.705507 | 0.953725 | 0.106267 | 0.720375 | 0.487500 | 0.820787 | 0.174497 |
| DTC4_SCFS_test | 0.684713 | 0.940092 | 0.113402 | 0.703448 | 0.458333 | 0.804734 | 0.181818 |
| DTC5_vif_training | 0.660016 | 0.959410 | 0.106818 | 0.664962 | 0.587500 | 0.785498 | 0.180769 |
| DTC5_vif_test | 0.617834 | 0.938144 | 0.100000 | 0.627586 | 0.500000 | 0.752066 | 0.166667 |
the recall scores have improved, with a significant drop of features
yet going further, let us test the models with both SCFS trimmed & VIF Trimmed data
2.Data cleansing:
E. Make all relevant modifications on the data using both functional/logical reasoning/assumptions.
%%time
# lets try skew corrections on the VIF-trimmed data
rmp=remap()
# custom class code written at the beginning of the notebook
# fit the remapping on train only, apply to test
X_train_vif_rmp=rmp.fit_transform(X_train_vif)
X_test_vif_rmp=rmp.transform(X_test_vif)
# lets study the model performance
dtc_pipe(X_train_vif_rmp, X_test_vif_rmp, Y_train, Y_test,"DTC6_vif+remap")
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
| DTC2_time_clt_training | 0.865922 | 0.958904 | 0.221519 | 0.895141 | 0.437500 | 0.925926 | 0.294118 |
| DTC2_time_clt_test | 0.863057 | 0.942652 | 0.228571 | 0.906897 | 0.333333 | 0.924429 | 0.271186 |
| DTC3_quasi_training | 0.642458 | 0.971391 | 0.119835 | 0.636829 | 0.725000 | 0.769310 | 0.205674 |
| DTC3_quasi_test | 0.585987 | 0.949438 | 0.110294 | 0.582759 | 0.625000 | 0.722222 | 0.187500 |
| DTC4_SCFS_training | 0.705507 | 0.953725 | 0.106267 | 0.720375 | 0.487500 | 0.820787 | 0.174497 |
| DTC4_SCFS_test | 0.684713 | 0.940092 | 0.113402 | 0.703448 | 0.458333 | 0.804734 | 0.181818 |
| DTC5_vif_training | 0.660016 | 0.959410 | 0.106818 | 0.664962 | 0.587500 | 0.785498 | 0.180769 |
| DTC5_vif_test | 0.617834 | 0.938144 | 0.100000 | 0.627586 | 0.500000 | 0.752066 | 0.166667 |
| DTC6_vif+remap_training | 0.656026 | 0.960298 | 0.107383 | 0.659847 | 0.600000 | 0.782213 | 0.182163 |
| DTC6_vif+remap_test | 0.605096 | 0.941489 | 0.103175 | 0.610345 | 0.541667 | 0.740586 | 0.173333 |
CPU times: user 423 ms, sys: 8.05 ms, total: 431 ms Wall time: 146 ms
%%time
# lets try skew corrections on the SCFS-trimmed data
rmp=remap()
# custom class code written at the beginning of the notebook
# fit the remapping on train only, apply to test
X_train_SCFS_rmp=rmp.fit_transform(X_train_SCFS)
X_test_SCFS_rmp=rmp.transform(X_test_SCFS)
# lets study the model performance
dtc_pipe(X_train_SCFS_rmp, X_test_SCFS_rmp, Y_train, Y_test,"DTC7_SCFS+remap")
| accuracy | precision_-1 | precision_1 | recall_-1 | recall_1 | fscore_-1 | fscore_1 | |
|---|---|---|---|---|---|---|---|
| DTC_raw_training | 0.950519 | 0.952730 | 0.846154 | 0.996590 | 0.275000 | 0.974167 | 0.415094 |
| DTC_raw_test | 0.914013 | 0.925566 | 0.200000 | 0.986207 | 0.041667 | 0.954925 | 0.068966 |
| DTC2_time_clt_training | 0.865922 | 0.958904 | 0.221519 | 0.895141 | 0.437500 | 0.925926 | 0.294118 |
| DTC2_time_clt_test | 0.863057 | 0.942652 | 0.228571 | 0.906897 | 0.333333 | 0.924429 | 0.271186 |
| DTC3_quasi_training | 0.642458 | 0.971391 | 0.119835 | 0.636829 | 0.725000 | 0.769310 | 0.205674 |
| DTC3_quasi_test | 0.585987 | 0.949438 | 0.110294 | 0.582759 | 0.625000 | 0.722222 | 0.187500 |
| DTC4_SCFS_training | 0.705507 | 0.953725 | 0.106267 | 0.720375 | 0.487500 | 0.820787 | 0.174497 |
| DTC4_SCFS_test | 0.684713 | 0.940092 | 0.113402 | 0.703448 | 0.458333 | 0.804734 | 0.181818 |
| DTC5_vif_training | 0.660016 | 0.959410 | 0.106818 | 0.664962 | 0.587500 | 0.785498 | 0.180769 |
| DTC5_vif_test | 0.617834 | 0.938144 | 0.100000 | 0.627586 | 0.500000 | 0.752066 | 0.166667 |
| DTC6_vif+remap_training | 0.656026 | 0.960298 | 0.107383 | 0.659847 | 0.600000 | 0.782213 | 0.182163 |
| DTC6_vif+remap_test | 0.605096 | 0.941489 | 0.103175 | 0.610345 | 0.541667 | 0.740586 | 0.173333 |
| DTC7_SCFS+remap_training | 0.712690 | 0.954190 | 0.108939 | 0.728048 | 0.487500 | 0.825919 | 0.178082 |
| DTC7_SCFS+remap_test | 0.694268 | 0.936937 | 0.108696 | 0.717241 | 0.416667 | 0.812500 | 0.172414 |
CPU times: user 475 ms, sys: 8.03 ms, total: 483 ms Wall time: 192 ms
skew correction has decreased the scores in case of VIF trimmed dataset
where as for SCFS trimmed dataset there is no change in scores
apart from above skew correction,
timestamp feature extraction
and cluster feature extraction
were performed earlier in the notebook
# let us check for duplicate rows in both trimmed datasets
display(X_train_SCFS.duplicated().sum())
display(X_train_vif.duplicated().sum())
0
0
there are no duplicate records
# let us check for duplicate features (transpose so columns become rows)
display(X_train_SCFS.T.duplicated().sum())
display(X_train_vif.T.duplicated().sum())
0
0
as expected, the SCFS and VIF methods would have removed any duplicated features through similarity & collinearity checks
3. Data analysis & visualisation:
A. Perform a detailed univariate Analysis with appropriate detailed comments after each analysis.
B. Perform bivariate and multivariate analysis with appropriate detailed comments after each analysis.
# lets study the columns in order of importance
passind=Y_train.loc[Y_train==-1].index
failind=Y_train.loc[Y_train==1].index

def univar(X, col):
    """Display univariate plots of one feature, split by Pass (-1) / Fail (+1).

    Features with more than 10 distinct values are treated as continuous
    (histogram + KDE with box plots underneath); otherwise as discrete
    (overlaid half-unit-bin histograms + curves).  Returns the
    (pass_values, fail_values) pair for further inspection.
    """
    pass_vals = X[col].loc[passind]
    fail_vals = X[col].loc[failind]
    n_unique = X[col].nunique()
    if n_unique > 10:
        # continuous feature: distplot in row 1, box plots in row 2
        fig = make_subplots(rows=2, cols=1)
        dist = ff.create_distplot([pass_vals, fail_vals], ['Pass', 'Fail'],
                                  curve_type='kde',
                                  bin_size=int(n_unique / 10),
                                  show_hist=True, show_rug=False)
        fig.add_trace(go.Histogram(dist.data[0], marker=dict(color='#1E90FF', opacity=0.4)), 1, 1)
        fig.add_trace(go.Histogram(dist.data[1], marker=dict(color='#FF4500', opacity=0.4)), 1, 1)
        fig.add_trace(go.Scatter(dist.data[2], line=dict(color='#1E90FF')), 1, 1)
        fig.add_trace(go.Scatter(dist.data[3], line=dict(color='#FF4500')), 1, 1)
        title = ("continuous feature %s<br>pass mean %.2f, fail mean %.2f"
                 "<br>pass median %.2f, fail median %.2f"
                 % (col, pass_vals.mean(), fail_vals.mean(),
                    pass_vals.median(), fail_vals.median()))
        fig.update_layout(title=title)
        fig.add_trace(go.Box(x=pass_vals, name='Pass', hovertemplate='%{x}', jitter=1,
                             marker=dict(color='#1E90FF')), 2, 1)
        fig.add_trace(go.Box(x=fail_vals, name='Fail', hovertemplate='%{x}', jitter=1,
                             marker=dict(color='#FF4500')), 2, 1)
    else:
        # discrete feature: single panel with half-unit bins
        fig = go.Figure()
        dist = ff.create_distplot([pass_vals, fail_vals], ['Pass', 'Fail'],
                                  show_curve=True, show_hist=True, show_rug=False)
        bins = dict(start=0, end=n_unique, size=0.5)
        fig.add_trace(go.Histogram(dist.data[0], xbins=bins,
                                   marker=dict(color='#1E90FF', opacity=0.4)))
        fig.add_trace(go.Histogram(dist.data[1], xbins=bins,
                                   marker=dict(color='#FF4500', opacity=0.4)))
        fig.add_trace(go.Scatter(dist.data[2], line=dict(color='#1E90FF')))
        fig.add_trace(go.Scatter(dist.data[3], line=dict(color='#FF4500')))
        fig.update_layout(title="discrete feature %s<br>pass mean %.2f, fail mean %.2f"
                                % (col, pass_vals.mean(), fail_vals.mean()))
    fig.show()
    return pass_vals, fail_vals
#lets study the features in VIF trimmed dataset
cols=X_train_vif.columns
univar(X_train_vif,cols[0]);
the data follows a normal distribution, but with several extreme values on either sides means of pass & fail classes are not far off compared to the range of the data
univar(X_train_vif,cols[1]);
follows bell curve
difference in target class means found
univar(X_train_vif,cols[2]);
does not follow normal distribution
twin peaks found, one near 0, and another near 400
difference in target class means found
univar(X_train_vif,cols[3]);
does not follow normal distribution
twin peaks found, one near 0, and another near 400
difference in target class means found, but on the opposite direction from previous feature
univar(X_train_vif,cols[4]);
univar(X_train_vif,cols[5]);
above two features exhibit similar distributions
yet the SCFS & VIF methods have confirmed that these are not related
univar(X_train_vif,cols[6]);
a skewed bell curve found
central tendencies are close for target classes
univar(X_train_vif,cols[7]);
twin peak found
univar(X_train_vif,cols[8]);
heavy peak found close to zero, causing high skewed distribution
univar(X_train_vif,cols[9]);
univar(X_train_vif,cols[10]);
above two features exhibit similar distributions
yet the SCFS & VIF methods have confirmed that these are not related
univar(X_train_vif,cols[11]);
close to uniform distribution
univar(X_train_vif,cols[12]);
univar(X_train_vif,cols[13]);
above two features exhibite near uniform distribution
last two features are synthesised features of inherent data clusters
doesn't follow any distribution
Let plot a bivariate pair plot and study further
# join target onto the VIF-trimmed features for a pair plot
# cast to object so plotly treats the target as categorical, not numeric
# (assumes the target series is named 'Pass/Fail' -- TODO confirm upstream)
gdata=X_train_vif.merge(Y_train.astype('object'),how='inner',left_index=True,right_index=True)
dims=gdata.columns
fig = px.scatter_matrix(gdata,dimensions=dims,color='Pass/Fail',symbol='Pass/Fail',opacity=0.5,height=800)
fig.show()
from the above plot one may not be able to decipher any relations as the datapoints have clouded all over their space
let us study further
passind=Y_train.loc[Y_train==-1].index
failind=Y_train.loc[Y_train==1].index

def bivar(X, a, b):
    """Scatter feature `a` against feature `b`, coloured by Pass/Fail,
    with a grey-scale 2-D density contour overlaid for the Fail class."""
    pass_x, pass_y = X[a].loc[passind], X[b].loc[passind]
    fail_x, fail_y = X[a].loc[failind], X[b].loc[failind]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=pass_x, y=pass_y, mode='markers', name='Pass',
                             marker=dict(color="#1E90FF", opacity=1, size=5)))
    fig.add_trace(go.Scatter(x=fail_x, y=fail_y, mode='markers', name='Fail',
                             marker=dict(color="#FF4500", opacity=1, size=6)))
    # contour of fail-class density, drawn last so it sits on top
    fig.add_trace(go.Histogram2dContour(
        x=fail_x, y=fail_y, showscale=False, reversescale=True, ncontours=12,
        name='FailDensity',
        colorscale=[[0.0, 'rgb(150,150,150)'], [1.0, 'rgb(255, 255, 255)']],
        showlegend=True))
    fig.update_layout(height=650, width=650)
    fig.update_xaxes(title="Feature " + a)
    fig.update_yaxes(title="Feature " + b)
    fig.show()
# let us study multivariate distributions of top 5 important features in VIF trimmed dataset
# fimp is importance-sorted, so filtering it by the VIF columns keeps importance order
cols=fimp[fimp.index.isin(X_train_vif.columns)].index
# all unordered pairs among the first 5 columns (the [:5] slice caps the inner loop)
for i,a in enumerate(cols):
for b in cols[(i+1):5]:
bivar(X_train_vif,a,b)
input("comments: ")  # pause after each plot to record a manual observation
comments: the fail class seems to be spread over the entire region, more concentrated in the extreme values
comments: the fail class is concentrated around 0.5 of feature 129, and majorily around 250 of feature 486
comments: triple peaks of failure occurances found around (0,0),(400,0),(250,400)
comments: failure classes occur along the 2 bud shaped zones in a majority
comments: peaking failures around (500,0.5)
comments: two peak zones (0-500,0) and (500,400) found to be more prone to failure
comments: more concetration close to origin
comments: two peaks observed around (0.5,0) and (0.5,400)
comments: very high probabilty around the origin
comments: upto 5 clusters could be found, with prominent ones around origin and around (400,100)
4. Data pre-processing:
D. Check if the train and test data have similar statistical characteristics when compared with original data.
# lets create a table of feature summary characteristics
# training predictors: describe() with the 'count' row sliced off, plus skew/kurtosis
X_train_stats=X_train_SCFS.describe()[1:].T
X_train_stats["skew"]=X_train_SCFS.skew()
X_train_stats["kurt"]=X_train_SCFS.kurt()
# testing predictors: same summary for the test split
X_test_stats=X_test_SCFS.describe()[1:].T
X_test_stats["skew"]=X_test_SCFS.skew()
X_test_stats["kurt"]=X_test_SCFS.kurt()
# targets: class counts and imbalance ratio per split
Y_stats=pd.DataFrame(index=["train","test"],columns=["-1 count","1 count","ratio -1:1"])
stat=Y_train.value_counts()
Y_stats.iloc[0]=[stat.loc[-1],stat.loc[1],stat.loc[-1]/stat.loc[1]]
stat=Y_test.value_counts()
Y_stats.iloc[1]=[stat.loc[-1],stat.loc[1],stat.loc[-1]/stat.loc[1]]
display(Y_stats)
| -1 count | 1 count | ratio -1:1 | |
|---|---|---|---|
| train | 1173 | 80 | 14.6625 |
| test | 290 | 24 | 12.083333 |
the target class imbalance ratio is similar in the training & testing datasets (~14.7:1 vs ~12.1:1)
# lets compare the stats with a random column
def ttcomp(col):
"""Compare train vs test summary statistics and class-wise KDEs for one column.

Left bar panel: all describe() stats except skew/kurt; right panel: skew/kurt
(separated because their scale differs). Then overlays the pass/fail KDE
curves of the column for both splits.
"""
n=col
fig=make_subplots(rows=1,cols=2)
#------------------------------
# panel 1: every stat except the last two (skew, kurt)
cols=X_train_stats.columns[:-2]
fig.add_trace(go.Bar(x=cols,y=X_train_stats.loc[n,cols],
marker=dict(color="#87CEEB",opacity=0.8),name="training"),1,1)
fig.add_trace(go.Bar(x=cols,y=X_test_stats.loc[n,cols],
marker=dict(color="#A9A9A9",opacity=0.8),name="testing"),1,1)
#------------------------------
# panel 2: skew and kurtosis only
cols=X_train_stats.columns[-2:]
fig.add_trace(go.Bar(x=cols,y=X_train_stats.loc[n,list(cols)],
marker=dict(color="#87CEEB",opacity=0.8),name="training",showlegend=False),1,2)
fig.add_trace(go.Bar(x=cols,y=X_test_stats.loc[n,list(cols)],
marker=dict(color="#A9A9A9",opacity=0.8),name="testing",showlegend=False),1,2)
# (note: "ditribution" typo below is in a runtime title string, left unchanged)
fig.update_layout(title="comparison of train & test data ditribution w.r.t. column %s"%n)
fig.show()
#------------------------------
# lets visualise the class-wise distribution of the column in both splits
fig=go.Figure()
#------------------------------
# training split: pass vs fail KDE
passind=Y_train.loc[Y_train==-1].index
failind=Y_train.loc[Y_train==1].index
a=X_train_SCFS.loc[passind,n]
b=X_train_SCFS.loc[failind,n]
fig2 = ff.create_distplot([a,b],['Pass', 'Fail'],curve_type='kde',show_hist=False,show_rug=False)
fig.add_trace(go.Scatter(fig2.data[0],line=dict(color='#1E90FF'),name='train/pass'))
fig.add_trace(go.Scatter(fig2.data[1],line=dict(color='#FF4500'),name='train/fail'))
#------------------------------
# test split: pass vs fail KDE (note: rebinds passind/failind locally)
passind=Y_test.loc[Y_test==-1].index
failind=Y_test.loc[Y_test==1].index
a=X_test_SCFS[n].loc[passind]
b=X_test_SCFS[n].loc[failind]
fig2 = ff.create_distplot([a,b],['Pass', 'Fail'],curve_type='kde',show_hist=False,show_rug=False)
fig.add_trace(go.Scatter(fig2.data[0],line=dict(color='#7B68EE'),name='test/pass'))
fig.add_trace(go.Scatter(fig2.data[1],line=dict(color='#CD5C5C'),name='test/fail'))
fig.show()
# compare train/test distributions for the 11th most important feature
ttcomp(fimp.index[10])
most of the statistical indices of the feature are closely similar in training & testing dataset
5. Model training, testing and tuning:
A. Use any Supervised Learning technique to train a model.
B. Use cross validation techniques.
Hint: Use all CV techniques that you have learnt in the course.
C.
Apply hyper-parameter tuning techniques to get the best accuracy.
Suggestion: Use all possible hyper parameter combinations to extract the best accuracies.
D.
Use any other technique/method which can enhance the model performance.
Hint: Dimensionality reduction, attribute removal, standardisation/normalisation, target balancing etc.
E. Display and explain the classification report in detail.
F. Apply the above steps for all possible models that you have learnt so far.
# reset the score log between notebook runs
try:
    del scoreLog
    print("scoreLog deleted")
# FIX: catch only NameError (name never defined) — a bare `except:` would
# also swallow KeyboardInterrupt/SystemExit and hide real errors
except NameError:
    print("scoreLog undefined")
scoreLog deleted
# lets encode the target class from (pass,fail)=(-1,1) to (0,1)
# in-place relabel via .loc so downstream classifiers/metrics see {0,1}
Y_train.loc[Y_train==-1]=0
Y_test.loc[Y_test==-1]=0
# candidate estimators; index order matters — later cells refer to models[i]
models = [
    # FIX: use the classification objective 'binary:logistic' instead of the
    # regression alias 'reg:logistic' (same loss function, but the documented
    # objective for XGBClassifier binary classification)
    XGBClassifier(objective='binary:logistic', n_jobs=-1, eval_metric='rmse', use_label_encoder=False),
    DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1),
    RandomForestClassifier(n_estimators=50, random_state=1, max_features=12),
    LogisticRegression(solver="liblinear"),
    KNeighborsClassifier(n_neighbors=5, weights='distance'),
    SVC(gamma=0.025, C=3),
]
# hyper-parameter search spaces, in the same order as `models`
xgbcp = dict(eta=np.arange(0.01, 0.2, 0.01),        # learning rate
             max_depth=np.arange(3, 10, 1),
             colsample_bytree=np.arange(0.5, 1, 0.1))
lrp = dict(C=np.logspace(-5, 8, 15))
dtcp = {"max_depth": [3, None],
        "max_features": np.arange(5, 15, 1),
        "min_samples_leaf": np.arange(1, 9, 1),
        "criterion": ["gini", "entropy"]}
rdcp = dtcp  # the random forest shares the decision-tree grid
kncp = dict(n_neighbors=np.arange(3, 9, 2),
            weights=['uniform', 'distance'],
            leaf_size=np.arange(10, 100, 5))
svcp = dict(C=np.arange(1, 10, 1),
            kernel=['linear', 'rbf'])
params = [xgbcp, dtcp, rdcp, lrp, kncp, svcp]
%%time
# learn first model with KFold CrossValidation
# 10 shuffled folds with a fixed seed for reproducibility
kfold=KFold(n_splits=10,random_state=129,shuffle=True)
model=models[0]
results=cross_val_score(model,X_train_vif,Y_train,cv=kfold)
CPU times: user 5.43 s, sys: 36.2 ms, total: 5.46 s Wall time: 717 ms
# lets review results (one accuracy per fold)
print(results)
# FIX: results.std() alone is not a 95% confidence interval; report the
# normal-approximation interval as mean +/- 1.96 * std
print("\nMean Accuracy: %.2f\n95%% confidence interval: +/- %.2f"%(
    results.mean(), 1.96 * results.std()))
[0.92063492 0.88095238 0.96825397 0.936 0.928 0.92 0.944 0.92 0.936 0.92 ] Mean Accuracy: 0.93 95% confidence interval: 0.02
%%time
# lets try another CV scheme: leave-one-out (one fold per training sample)
loocv = LeaveOneOut()
model = models[0]
results = cross_val_score(model, X_train_vif, Y_train, cv=loocv)
# lets review results (per-sample 0/1 accuracies)
print(results)
# FIX: results.std() alone is not a 95% confidence interval; report the
# normal-approximation interval as mean +/- 1.96 * std
print("\nMean Accuracy: %.2f\n95%% confidence interval: +/- %.2f" % (
    results.mean(), 1.96 * results.std()))
[1. 1. 1. ... 1. 0. 1.] Mean Accuracy: 0.93 95% confidence interval: 0.26 CPU times: user 11min 7s, sys: 6.05 s, total: 11min 13s Wall time: 1min 25s
the LOOCV provides a caution about widened confidence interval,
yet consumes far more compute time.
for the upcoming models lets stick to KFold
# before tuning, lets review test scores
# fit on the full training split, then score on the held-out test split
model.fit(X_train_vif,Y_train)
print("Test Accuracy : %.2f"%metrics.accuracy_score(Y_test,model.predict(X_test_vif)))
Test Accuracy : 0.92
the test accuracy is lower than on the training data
%%time
# randomized search over the XGB grid: 10-fold CV, accuracy as the objective
# (trailing semicolons suppress the cell's repr output)
tune=RandomizedSearchCV(estimator=model,param_distributions=xgbcp,
cv=10,scoring="accuracy");
tune.fit(X_train_vif,Y_train);
CPU times: user 43.3 s, sys: 965 ms, total: 44.3 s Wall time: 5.59 s
RandomizedSearchCV(cv=10,
estimator=XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1,
colsample_bynode=1,
colsample_bytree=1,
eval_metric='rmse', gamma=0,
gpu_id=-1, importance_type='gain',
interaction_constraints='',
learning_rate=0.300000012,
max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan,
monotone_constraints='()',
n_estimators...
reg_lambda=1, scale_pos_weight=1,
subsample=1, tree_method='exact',
use_label_encoder=False,
validate_parameters=1,
verbosity=None),
param_distributions={'colsample_bytree': array([0.5, 0.6, 0.7, 0.8, 0.9]),
'eta': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19]),
'max_depth': array([3, 4, 5, 6, 7, 8, 9])},
scoring='accuracy')
# confirm which hyper-parameters the search actually sampled
tune.best_params_.keys()
dict_keys(['max_depth', 'eta', 'colsample_bytree'])
%%time
# lets re-run the KFold CV with the tuned parameters applied
model.set_params(**tune.best_params_)
kfold = KFold(n_splits=10, random_state=129, shuffle=True)
results = cross_val_score(model, X_train_vif, Y_train, cv=kfold)
# lets review results
print(results)
# FIX: results.std() alone is not a 95% confidence interval; report the
# normal-approximation interval as mean +/- 1.96 * std
print("\nMean Accuracy: %.2f\n95%% confidence interval: +/- %.2f" % (
    results.mean(), 1.96 * results.std()))
[0.92063492 0.88888889 0.97619048 0.944 0.928 0.928 0.952 0.936 0.936 0.928 ] Mean Accuracy: 0.93 95% confidence interval: 0.02 CPU times: user 5.3 s, sys: 32 ms, total: 5.33 s Wall time: 681 ms
# lets review test scores with the tuned parameters
model.fit(X_train_vif,Y_train)
print("Test Accuracy : %.2f"%metrics.accuracy_score(Y_test,model.predict(X_test_vif)))
Test Accuracy : 0.92
Test accuracy not improved
%%time
# lets use all other techniques: standardise, rebalance, re-tune, re-score
# standardize — fit the scaler on train only, then apply to test
scl = StandardScaler()
X_train_std = pd.DataFrame(scl.fit_transform(X_train_vif),
                           columns=X_train_vif.columns, index=X_train_vif.index)
X_test_std = pd.DataFrame(scl.transform(X_test_vif),
                          columns=X_test_vif.columns, index=X_test_vif.index)
# balance the training data set (SMOTE oversamples the minority class;
# the test split is deliberately left imbalanced)
balancer = SMOTE(sampling_strategy='not majority', random_state=129)
X_train_bal, Y_train_bal = balancer.fit_resample(X_train_std, Y_train)
# hypertune on the balanced data
tune = RandomizedSearchCV(estimator=model, param_distributions=xgbcp,
                          cv=10, scoring="accuracy")
tune.fit(X_train_bal, Y_train_bal)
# lets fit to tuned parameters
model.set_params(**tune.best_params_)
kfold = KFold(n_splits=10, random_state=129, shuffle=True)
results = cross_val_score(model, X_train_bal, Y_train_bal, cv=kfold)
# lets review results
print(results)
# FIX: results.std() alone is not a 95% confidence interval; report the
# normal-approximation interval as mean +/- 1.96 * std
print("\nMean Accuracy: %.2f\n95%% confidence interval: +/- %.2f" % (
    results.mean(), 1.96 * results.std()))
model.fit(X_train_bal, Y_train_bal)
print("Test Accuracy : %.2f" % metrics.accuracy_score(Y_test, model.predict(X_test_std)))
[0.96595745 0.95319149 0.97021277 0.98723404 0.95319149 0.97446809 0.98717949 0.98717949 0.97435897 0.96153846] Mean Accuracy: 0.97 95% confidence interval: 0.01 Test Accuracy : 0.92 CPU times: user 1min 30s, sys: 616 ms, total: 1min 31s Wall time: 11.5 s
test accuracy dropped lower
need more trials
lets build a custom pipeline
# global log: one row per experiment (fitted model, CV accuracy, test accuracy)
scoreLog=pd.DataFrame(columns=["model_obj","Train_Acc","Test_Acc"])
def train_test_tune(estimator, p_grid, dset, scaler, skew_corr, mname):
    """Balance, optionally skew-correct, scale, hyper-tune and score a model.

    Parameters
    ----------
    estimator : sklearn-compatible classifier to tune and fit.
    p_grid    : parameter distributions for RandomizedSearchCV.
    dset      : list [X_train, Y_train, X_test, Y_test].
    scaler    : "z-score" (StandardScaler) or "minmax" (MinMaxScaler).
    skew_corr : if True, apply the project-defined `remap` skew-correction
                transform (fit on train, applied to both splits).
    mname     : row label under which the result is stored in the global
                `scoreLog` DataFrame (side effect of this function).

    Returns the transformed X_test so the caller can produce further
    reports (e.g. a classification report) on it.
    """
    clf = estimator
    [X_train, Y_train, X_test, Y_test] = dset
    # balance: oversample the minority class on the training split only
    balancer = SMOTE(sampling_strategy='not majority', random_state=129)
    X_train, Y_train = balancer.fit_resample(X_train, Y_train)
    # skew correction (project-defined transformer — TODO confirm it is
    # stateless across calls; a fresh instance is created each time)
    if skew_corr:
        rmp = remap()
        X_train = rmp.fit_transform(X_train)
        X_test = rmp.transform(X_test)
    # scale — fit on train only, then apply to test
    if scaler == "z-score":
        scl = StandardScaler()
    elif scaler == "minmax":
        scl = MinMaxScaler()
    else:
        # FIX: fail fast on an unknown scaler name instead of the confusing
        # NameError that the original code would raise two lines below
        raise ValueError("scaler must be 'z-score' or 'minmax', got %r" % (scaler,))
    X_train = pd.DataFrame(scl.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scl.transform(X_test), columns=X_test.columns, index=X_test.index)
    # hypertune
    tune = RandomizedSearchCV(estimator=clf, param_distributions=p_grid,
                              cv=10, scoring="accuracy")
    tune.fit(X_train, Y_train)
    # lets check tuned cross-validated accuracy
    clf.set_params(**tune.best_params_)
    kfold = KFold(n_splits=10, random_state=129, shuffle=True)
    results = cross_val_score(clf, X_train, Y_train, cv=kfold)
    # lets fit the tuned model on the full (balanced) training data
    clf.fit(X_train, Y_train)
    # FIX: score with the tuned `clf`, not the unrelated global `model` —
    # the original line made Test_Acc come from whatever model the notebook
    # last fitted, so every logged test accuracy was wrong
    testscore = metrics.accuracy_score(Y_test, clf.predict(X_test))
    # lets store the case in the global log
    score = [clf, results.mean(), testscore]
    scoreLog.loc[mname] = score
    return X_test
%%time
# pass the VIF-trimmed dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model: XGBoost, min-max scaled, skew-corrected
train_test_tune(models[0],xgbcp,dset,'minmax',True,"XGBC_mima_skew")
# review the accumulated scores
scoreLog
CPU times: user 1min 17s, sys: 708 ms, total: 1min 18s Wall time: 9.91 s
| model_obj | Train_Acc | Test_Acc | |
|---|---|---|---|
| XGBC_mima_skew | XGBClassifier(base_score=0.5, booster='gbtree'... | 0.967181 | 0.914013 |
%%time
# pass the VIF-trimmed dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model: decision tree, z-score scaled, skew-corrected
train_test_tune(models[1],params[1],dset,'z-score',True,"DTC_Z_skew")
# review the accumulated scores
scoreLog
CPU times: user 2.58 s, sys: 28.1 ms, total: 2.61 s Wall time: 1.35 s
| model_obj | Train_Acc | Test_Acc | |
|---|---|---|---|
| XGBC_mima_skew | XGBClassifier(base_score=0.5, booster='gbtree'... | 0.967181 | 0.914013 |
| DTC_Z_skew | DecisionTreeClassifier(criterion='entropy', ma... | 0.891737 | 0.898089 |
%%time
# pass the VIF-trimmed dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model: random forest, z-score scaled, skew-corrected
train_test_tune(models[2],params[2],dset,'z-score',True,"RDC_Z_skew")
# review the accumulated scores
scoreLog
CPU times: user 28.4 s, sys: 72 ms, total: 28.4 s Wall time: 27.2 s
| model_obj | Train_Acc | Test_Acc | |
|---|---|---|---|
| XGBC_mima_skew | XGBClassifier(base_score=0.5, booster='gbtree'... | 0.967181 | 0.914013 |
| DTC_Z_skew | DecisionTreeClassifier(criterion='entropy', ma... | 0.891737 | 0.898089 |
| RDC_Z_skew | (DecisionTreeClassifier(max_features=6, random... | 0.964199 | 0.898089 |
%%time
# pass the VIF-trimmed dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model: logistic regression, min-max scaled, skew-corrected
train_test_tune(models[3],params[3],dset,'minmax',True,"LR_mima_skew")
# review the accumulated scores
scoreLog
CPU times: user 1.71 s, sys: 15.9 ms, total: 1.73 s Wall time: 663 ms
| model_obj | Train_Acc | Test_Acc | |
|---|---|---|---|
| XGBC_mima_skew | XGBClassifier(base_score=0.5, booster='gbtree'... | 0.967181 | 0.914013 |
| DTC_Z_skew | DecisionTreeClassifier(criterion='entropy', ma... | 0.891737 | 0.898089 |
| RDC_Z_skew | (DecisionTreeClassifier(max_features=6, random... | 0.964199 | 0.898089 |
| LR_mima_skew | LogisticRegression(C=2275.845926074791, solver... | 0.658125 | 0.914013 |
%%time
# pass the VIF-trimmed dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model: k-nearest neighbours, z-score scaled, skew-corrected
train_test_tune(models[4],params[4],dset,'z-score',True,"KNC_Z_skew")
# review the accumulated scores
scoreLog
CPU times: user 2.59 s, sys: 43.9 ms, total: 2.64 s Wall time: 1.51 s
| model_obj | Train_Acc | Test_Acc | |
|---|---|---|---|
| XGBC_mima_skew | XGBClassifier(base_score=0.5, booster='gbtree'... | 0.967181 | 0.914013 |
| DTC_Z_skew | DecisionTreeClassifier(criterion='entropy', ma... | 0.891737 | 0.898089 |
| RDC_Z_skew | (DecisionTreeClassifier(max_features=6, random... | 0.964199 | 0.898089 |
| LR_mima_skew | LogisticRegression(C=2275.845926074791, solver... | 0.658125 | 0.914013 |
| KNC_Z_skew | KNeighborsClassifier(leaf_size=35, n_neighbors... | 0.849514 | 0.898089 |
%%time
# pass the VIF-trimmed dataset
dset=[X_train_vif,Y_train,X_test_vif,Y_test]
# fit model: support vector classifier, z-score scaled, skew-corrected
train_test_tune(models[5],params[5],dset,'z-score',True,"SVC_Z_skew")
# review the accumulated scores
scoreLog
CPU times: user 31.8 s, sys: 35.4 ms, total: 31.9 s Wall time: 30.8 s
| model_obj | Train_Acc | Test_Acc | |
|---|---|---|---|
| XGBC_mima_skew | XGBClassifier(base_score=0.5, booster='gbtree'... | 0.967181 | 0.914013 |
| DTC_Z_skew | DecisionTreeClassifier(criterion='entropy', ma... | 0.891737 | 0.898089 |
| RDC_Z_skew | (DecisionTreeClassifier(max_features=6, random... | 0.964199 | 0.898089 |
| LR_mima_skew | LogisticRegression(C=2275.845926074791, solver... | 0.658125 | 0.914013 |
| KNC_Z_skew | KNeighborsClassifier(leaf_size=35, n_neighbors... | 0.849514 | 0.898089 |
| SVC_Z_skew | SVC(C=9, gamma=0.025) | 0.928803 | 0.898089 |
the best model was the XGBClassifier, owing to its gradient-boosted trees
# select the best model: the row with the highest logged test accuracy
ind = scoreLog["Test_Acc"].argmax()
best = scoreLog.loc[scoreLog.index[ind], ["model_obj"]]
# pickle it
# FIX: use a context manager so the file handle is flushed and closed
# (the original `pickle.dump(best[0], open(..., 'wb'))` leaked the handle)
with open("best_model.bhar", 'wb') as fh:
    pickle.dump(best[0], fh)
# pass the VIF-trimmed dataset
dset = [X_train_vif, Y_train, X_test_vif, Y_test]
# re-run the winning pipeline to obtain the transformed test predictors
X_tested = train_test_tune(models[0], xgbcp, dset, 'minmax', True, "XGBC_mima_skew")
print(metrics.classification_report(Y_test, best[0].predict(X_tested)))
precision recall f1-score support
0 0.92 0.99 0.95 290
1 0.00 0.00 0.00 24
accuracy 0.91 314
macro avg 0.46 0.49 0.48 314
weighted avg 0.85 0.91 0.88 314
the accuracy has come to 91%, but the recall on the fail class is very poor, due to overfitting on the imbalanced data